In [ ]:
# Columns that must not enter the model as predictors: they are listed here
# by name and are skipped when the formula's right-hand side is built.
excludedCols = ["cvrNummer", "label", "status", "navn", "kortBeskrivelse"]

# Right-hand side of the R formula. Every term is followed by " + " so that
# `strs` keeps the exact value it had before (including the trailing
# separator, which is sliced off below); built with join instead of repeated
# string concatenation.
strs = "".join(i + " + " for i in logDf.columns if i not in excludedCols)

# Missing values are replaced by 0.0 before fitting.
imputedDf = logDf.fillna(value=0.0)

# "label ~ a + b + ..." -- strs[:-3] drops the trailing " + ".
formula = RFormula(formula="label ~ " + strs[:-3], labelCol="label")

# Binomial GLM with logit link.
glr = GeneralizedLinearRegression(
    family="binomial",
    link="logit",
    maxIter=10,
    regParam=0.3,
)

# NOTE(review): the scaler reads glr's features column and writes
# "scaledFeatures", but glr is never pointed at "scaledFeatures", so the
# regression is fitted on the unscaled column. Left unchanged because
# switching would alter the fitted coefficients -- confirm what is intended.
standardScale = StandardScaler(
    withMean=True,
    withStd=True,
    inputCol=glr.getFeaturesCol(),
    outputCol="scaledFeatures",
)
pipeline = Pipeline(stages=[formula, standardScale, glr])

# NOTE(review): `lr` is not defined in this cell and is not a stage of
# `pipeline`; presumably it is a classifier created earlier in the notebook.
# Verify that this grid is meant to tune `lr` rather than `glr`.
grid = (
    ParamGridBuilder()
    .baseOn({lr.predictionCol: "prediction"})
    .baseOn({lr.rawPredictionCol: "rawPrediction"})
    .baseOn({lr.probabilityCol: "probability"})
    .baseOn({lr.labelCol: "label"})
    .baseOn({lr.featuresCol: "features"})
    .addGrid(param=lr.elasticNetParam, values=[0.1, 1.0])
    # addGrid needs the Param object (lr.maxIter); lr.getMaxIter is the
    # bound getter method and is not a valid grid key.
    .addGrid(param=lr.maxIter, values=[10])
    .build()
)

evaluate = BinaryClassificationEvaluator()
trainEvalModel = TrainValidationSplit(
    estimator=pipeline,
    estimatorParamMaps=grid,
    evaluator=evaluate,
    trainRatio=0.8,
)
In [ ]:
# Predictor columns (everything not excluded) followed by the label column.
cols = [name for name in logDf.columns if name not in excludedCols]
cols.append("label")

# Fit the pipeline on the imputed rows whose label is at most 1.
model = pipeline.fit(
    imputedDf.filter(F.col("label") <= 1).select(*cols)
)
In [ ]:
# Score the same subset the model was fitted on (label at most 1).
predict = model.transform(
    imputedDf
    .select(*cols)
    .filter(F.col("label") <= 1)
)

# Last pipeline stage, i.e. the fitted regression model.
coef = model.stages[-1]
In [ ]:
# Training summary of the last pipeline stage.
p = model.stages[-1].summary

# (printed title, summary attribute) pairs, reported in this order. Each
# attribute is looked up only when its line is printed.
for _title, _attr in (
    ("Coefficient Standard Errors", "coefficientStandardErrors"),
    ("T Values", "tValues"),
    ("P Values", "pValues"),
    ("Dispersion", "dispersion"),
    ("Null Deviance", "nullDeviance"),
    ("Residual Degree Of Freedom Null", "residualDegreeOfFreedomNull"),
    ("Deviance", "deviance"),
    ("Residual Degree Of Freedom", "residualDegreeOfFreedom"),
    ("AIC", "aic"),
):
    print(_title + ": " + str(getattr(p, _attr)))

print("Deviance Residuals: ")
p.residuals().show()
In [ ]:
print(len(cols))
print(type(coef.coefficients.toArray()))
print()

# Per-term statistics for the report table. cols[:-1] drops the trailing
# "label" entry, and "intercept" is listed last.
# NOTE(review): this assumes one coefficient per predictor column -- confirm
# that the formula does not expand any column into several features.
summary = {
    "Labels": cols[:-1] + ["intercept"],
    # The intercept is appended at the END so that it lines up with the
    # "intercept" label and with the standard errors / t-values / p-values,
    # which report the intercept as their last element. (Inserting it at
    # index 0 shifted every coefficient by one row.)
    "Coefficients": np.append(coef.coefficients.toArray(), coef.intercept),
    "coefficient Std Err": p.coefficientStandardErrors,
    "T Values": p.tValues,
    "P Values": p.pValues,
}
In [ ]:
# Show floats with thousands separators and four decimals.
pd.set_option('display.float_format', '{:,.4f}'.format)

# Coefficient table with a fixed column order.
df = pd.DataFrame(
    summary,
    columns=[
        "Labels",
        "Coefficients",
        "coefficient Std Err",
        "T Values",
        "P Values",
    ],
)

import subprocess

# HTML wrapper placed around the rendered table.
# NOTE(review): the style rule ".df tbody" has no declaration block, so it
# has no effect as written; kept verbatim.
HEADER = "\n".join([
    "",
    "<html>",
    "<head>",
    "<style>",
    ".df tbody",
    "</style>",
    "</head>",
    "<body>",
    "",
])
FOOTER = "\n".join([
    "",
    "</body>",
    "</html>",
    "",
])

# Write header, table (CSS class "df") and footer to test.html, in that order.
with open('test.html', 'w') as html_out:
    html_out.write(HEADER)
    html_out.write(df.to_html(classes='df'))
    html_out.write(FOOTER)
In [ ]: